This project uses images of daisies, dandelions, roses, sunflowers, and tulips from Flickr and Google Images in an attempt to predict the species of a flower from a 224x224 pixel image.

library(keras)
library(tensorflow)

# Enable on-demand GPU memory growth so TensorFlow allocates VRAM as needed
# instead of claiming it all at start-up.
# NOTE(review): `[[1]]` assumes at least one visible GPU and errors on
# CPU-only machines -- confirm this is the intended environment.
gpu <- tf$config$experimental$get_visible_devices('GPU')[[1]]
tf$config$experimental$set_memory_growth(device = gpu, enable = TRUE)
# Root of the flower dataset (machine-specific absolute path).
base_dir <- "C:/Users/ian/Documents/Projects/flowers_final"

# Per-split subdirectories; each contains one folder per flower class.
train_dir <- file.path(base_dir, "train")
validation_dir <- file.path(base_dir, "validation")
test_dir <- file.path(base_dir, "test")

Data Pipeline & Pre-Processing

  • 3120 training images

  • 600 validation images

  • 600 test images

  • 768 daisies

  • 1049 dandelions

  • 784 roses

  • 734 sunflowers

  • 984 tulips

Images were split into an 80% training set and a 20% test set; the training set was then split again at the same 80/20 ratio to create the validation set.

# Both input pipelines only rescale pixel values from [0, 255] to [0, 1];
# no augmentation is applied at this stage.
train_datagen <- image_data_generator(rescale = 1/255)
validation_datagen <- image_data_generator(rescale = 1/255)

# Helper: stream 224x224 images from a class-per-folder directory in
# batches of 20 with one-hot (categorical) labels.
make_flow <- function(dir, datagen) {
  flow_images_from_directory(
    dir,
    datagen,
    target_size = c(224, 224),
    batch_size = 20,
    class_mode = "categorical"
  )
}

train_generator <- make_flow(train_dir, train_datagen)
validation_generator <- make_flow(validation_dir, validation_datagen)

# Pull one batch from the generator to sanity-check tensor shapes:
# images are (20, 224, 224, 3) in [0, 1], labels are (20, 5) one-hot rows.
batch <- generator_next(train_generator)
str(batch)
## List of 2
##  $ : num [1:20, 1:224, 1:224, 1:3] 0.847 0 0.584 1 0.227 ...
##  $ : num [1:20, 1:5] 1 0 0 0 0 0 0 1 0 0 ...
head(batch[[2]]) # labels are a one-hot encoded matrix with one column for each class
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    0    0    0    0
## [2,]    0    1    0    0    0
## [3,]    0    0    0    1    0
## [4,]    0    0    0    1    0
## [5,]    0    0    0    0    1
## [6,]    0    1    0    0    0

Image Preview

Some images in the dataset have people or bugs in the way of the flowers; however, most photos have the flower in the foreground.

# Preview the first 8 training images from the batch drawn above.
# Fixes: `<-` instead of `=` for assignment, and the loop bound is derived
# from the array itself rather than hard-coded.
imgs <- batch[[1]][1:8, , , ]
str(imgs)
##  num [1:8, 1:224, 1:224, 1:3] 0.847 0 0.584 1 0.227 ...
# Plot in a 2x4 grid of square panels, restoring graphics settings after.
{op <- par(mfrow = c(2, 4), pty = "s", mar = c(1, 0, 1, 0))
for (i in seq_len(dim(imgs)[1])) {
  plot(as.raster(imgs[i, , , ]))
}
par(op)}

Convolution Neural Network

Building The Network

We begin by building a network with 3 convolution and max pooling layers before flattening and sending the output to a densely connected layer.

# Baseline CNN: four conv + max-pool stages (32 -> 64 -> 64 -> 128 filters,
# all 3x3 kernels with ReLU), flattened into a 256-unit dense layer and a
# 5-way softmax output (one unit per flower class).
model <- keras_model_sequential() %>%
  layer_conv_2d(filters = 32, kernel_size = c(3, 3), activation = "relu", input_shape = c(224, 224, 3)) %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_conv_2d(filters = 64, kernel_size = c(3, 3), activation = "relu") %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_conv_2d(filters = 64, kernel_size = c(3, 3), activation = "relu") %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_conv_2d(filters = 128, kernel_size = c(3, 3), activation = "relu") %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_flatten() %>%
  layer_dense(units = 256, activation = "relu") %>%
  layer_dense(units = 5, activation = "softmax")

# Categorical cross-entropy matches the one-hot labels from the generators.
# NOTE(review): `lr` is deprecated in newer keras releases in favor of
# `learning_rate` -- confirm against the installed keras version.
model %>% compile(
  loss = "categorical_crossentropy",
  optimizer = optimizer_adam(lr = 1e-4),
  metrics = c("accuracy")
)

# Print the layer-by-layer summary, then snapshot the (still untrained)
# weights so training below can start from this exact initialization.
model
## Model
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## conv2d (Conv2D)                     (None, 222, 222, 32)            896         
## ________________________________________________________________________________
## max_pooling2d (MaxPooling2D)        (None, 111, 111, 32)            0           
## ________________________________________________________________________________
## conv2d_1 (Conv2D)                   (None, 109, 109, 64)            18496       
## ________________________________________________________________________________
## max_pooling2d_1 (MaxPooling2D)      (None, 54, 54, 64)              0           
## ________________________________________________________________________________
## conv2d_2 (Conv2D)                   (None, 52, 52, 64)              36928       
## ________________________________________________________________________________
## max_pooling2d_2 (MaxPooling2D)      (None, 26, 26, 64)              0           
## ________________________________________________________________________________
## conv2d_3 (Conv2D)                   (None, 24, 24, 128)             73856       
## ________________________________________________________________________________
## max_pooling2d_3 (MaxPooling2D)      (None, 12, 12, 128)             0           
## ________________________________________________________________________________
## flatten (Flatten)                   (None, 18432)                   0           
## ________________________________________________________________________________
## dense (Dense)                       (None, 256)                     4718848     
## ________________________________________________________________________________
## dense_1 (Dense)                     (None, 5)                       1285        
## ================================================================================
## Total params: 4,850,309
## Trainable params: 4,850,309
## Non-trainable params: 0
## ________________________________________________________________________________
model %>% save_model_hdf5('base_model.h5')

Training The Network

The model appears to overfit the dataset very quickly; this is likely due to the small number of training images.

# Reload the untrained snapshot and train for 30 epochs.
# steps_per_epoch 156 x batch_size 20 = 3120 images (the full training set);
# validation_steps 30 x 20 = 600 (the full validation set).
# NOTE(review): fit_generator() is deprecated in newer keras in favor of
# fit(), which accepts generators directly -- confirm installed version.
model <- load_model_hdf5('base_model.h5')

history <- model %>% fit_generator(
  train_generator,
  steps_per_epoch = 156,
  epochs = 30,
  validation_data = validation_generator,
  validation_steps = 30, 
  verbose = 0
)

plot(history)
## `geom_smooth()` using formula 'y ~ x'

# Persist the trained weights over the initial snapshot.
model %>% save_model_hdf5('base_model.h5')

Data Augmentation

To improve model performance we can try augmenting the images. Here we create a data pipeline that feeds batches of photos to the model, rather than loading them all at once, to save memory and compute more efficiently.

# Augmented training pipeline: random rotations (up to 40 degrees), shifts,
# shears, zooms (each up to 20%) and horizontal flips, on top of the
# [0, 1] rescaling.
datagen <- image_data_generator(
  rescale = 1/255,
  rotation_range = 40,
  width_shift_range = 0.2,
  height_shift_range = 0.2,
  shear_range = 0.2,
  zoom_range = 0.2,
  horizontal_flip = TRUE
)

# Validation images are only rescaled -- never augmented.
test_datagen <- image_data_generator(rescale = 1/255)

train_generator <- flow_images_from_directory(
  train_dir,
  datagen,
  target_size = c(224, 224),
  batch_size = 20,
  class_mode = "categorical"
)

validation_generator <- flow_images_from_directory(
  validation_dir,
  test_datagen,
  target_size = c(224, 224),
  batch_size = 20,
  class_mode = "categorical"
)

Preview Augmented Images

# Fix: use `<-` instead of `=` for top-level assignment.
train_tulips_dir <- "C:/Users/ian/Documents/Projects/flowers_final/train/tulip"

# Load one tulip image and reshape it to the 4-D (batch, height, width,
# channels) tensor the generator API expects.
fnames <- list.files(train_tulips_dir, full.names = TRUE)
img_path <- fnames[[3]]
img <- image_load(img_path, target_size = c(224, 224))
img_array <- image_to_array(img)
img_array <- array_reshape(img_array, c(1, 224, 224, 3))

# Generator that yields a freshly augmented variant of the image each call.
augmentation_generator <- flow_images_from_data(
  img_array,
  generator = datagen,
  batch_size = 1
)

# Draw and plot four augmented variants, restoring par() afterwards.
{op <- par(mfrow = c(2, 2), pty = "s", mar = c(1, 0, 1, 0))
for (i in 1:4) {
  batch <- generator_next(augmentation_generator)
  plot(as.raster(batch[1, , , ]))
}
par(op)}

Building The Model

Here I reuse the model from before; it appears to give decent results compared to smaller and larger models.

# Same architecture as the earlier baseline CNN (four conv + max-pool
# stages, 256-unit dense layer, 5-way softmax), now to be trained on the
# augmented generators defined above.
model <- keras_model_sequential() %>%
  layer_conv_2d(filters = 32, kernel_size = c(3, 3), activation = "relu", input_shape = c(224, 224, 3)) %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_conv_2d(filters = 64, kernel_size = c(3, 3), activation = "relu") %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_conv_2d(filters = 64, kernel_size = c(3, 3), activation = "relu") %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_conv_2d(filters = 128, kernel_size = c(3, 3), activation = "relu") %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_flatten() %>%
  layer_dense(units = 256, activation = "relu") %>%
  layer_dense(units = 5, activation = "softmax")

# NOTE(review): `lr` is deprecated in newer keras releases in favor of
# `learning_rate` -- confirm against the installed keras version.
model %>% compile(
  loss = "categorical_crossentropy",
  optimizer = optimizer_adam(lr = 1e-4),
  metrics = c("accuracy")
)

# Summary of the augmented-data model (identical architecture to the
# baseline), then snapshot the untrained weights for a clean training start.
model
## Model
## Model: "sequential_1"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## conv2d_4 (Conv2D)                   (None, 222, 222, 32)            896         
## ________________________________________________________________________________
## max_pooling2d_4 (MaxPooling2D)      (None, 111, 111, 32)            0           
## ________________________________________________________________________________
## conv2d_5 (Conv2D)                   (None, 109, 109, 64)            18496       
## ________________________________________________________________________________
## max_pooling2d_5 (MaxPooling2D)      (None, 54, 54, 64)              0           
## ________________________________________________________________________________
## conv2d_6 (Conv2D)                   (None, 52, 52, 64)              36928       
## ________________________________________________________________________________
## max_pooling2d_6 (MaxPooling2D)      (None, 26, 26, 64)              0           
## ________________________________________________________________________________
## conv2d_7 (Conv2D)                   (None, 24, 24, 128)             73856       
## ________________________________________________________________________________
## max_pooling2d_7 (MaxPooling2D)      (None, 12, 12, 128)             0           
## ________________________________________________________________________________
## flatten_1 (Flatten)                 (None, 18432)                   0           
## ________________________________________________________________________________
## dense_2 (Dense)                     (None, 256)                     4718848     
## ________________________________________________________________________________
## dense_3 (Dense)                     (None, 5)                       1285        
## ================================================================================
## Total params: 4,850,309
## Trainable params: 4,850,309
## Non-trainable params: 0
## ________________________________________________________________________________
model %>% save_model_hdf5('aug_model.h5')

Training The Model

The results here look a lot better: the overfitting is not as severe as before and we get a modest increase in accuracy.

# Reload the untrained augmented-data model and train with the augmented
# generators; same step counts as the baseline run (156 x 20 = 3120
# training images, 30 x 20 = 600 validation images per epoch).
# NOTE(review): fit_generator() is deprecated in newer keras in favor of
# fit() -- confirm installed version.
model <- load_model_hdf5('aug_model.h5')

history <- model %>% fit_generator(
  train_generator,
  steps_per_epoch = 156,
  epochs = 30,
  validation_data = validation_generator,
  validation_steps = 30,
  verbose=0
)

# Persist trained weights before plotting the training curves.
model %>% save_model_hdf5('aug_model.h5')

plot(history)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

Using a Pretrained Convnet

Here we use a pretrained network for feature extraction and fine tuning. This model uses the VGG16 architecture trained on the ImageNet dataset to increase the model's performance.

Pre-Processing

For clarity we re-define the data pipeline. In addition, I reduced the batch size to prevent crashing my GPU, since this model takes up a lot of memory.

# Same augmentation scheme as before: random rotations, shifts, shears,
# zooms and horizontal flips on top of [0, 1] rescaling.
datagen <- image_data_generator(
  rescale = 1/255,
  rotation_range = 40,
  width_shift_range = 0.2,
  height_shift_range = 0.2,
  shear_range = 0.2,
  zoom_range = 0.2,
  horizontal_flip = TRUE
)

# Validation/test images are only rescaled, never augmented.
test_datagen <- image_data_generator(rescale = 1/255)

# batch_size = 1 keeps GPU memory usage low for the much larger VGG16 model
# (see the note above about crashing the GPU).
train_generator <- flow_images_from_directory(
  train_dir,
  datagen,
  target_size = c(224, 224),
  batch_size = 1,
  class_mode = "categorical"
)

validation_generator <- flow_images_from_directory(
  validation_dir,
  test_datagen,
  target_size = c(224, 224),
  batch_size = 1,
  class_mode = "categorical"
)

Feature Extraction & Fine-Tuning With Data Augmentation

First we freeze the weights in the vgg16 base, then we train the densely connected layer on top. After we unfreeze the last few layers of the vgg16 base and train again to fine tune the models performance.

# VGG16 convolutional base pretrained on ImageNet, without its classifier
# head (include_top = FALSE), matched to our 224x224x3 inputs.
conv_base <- application_vgg16(
  weights = "imagenet",
  include_top = FALSE,
  input_shape = c(224, 224, 3)
)

# Stack a fresh dense classifier head on top of the pretrained base.
model <- keras_model_sequential() %>%
  conv_base %>%
  layer_flatten() %>%
  layer_dense(units = 256, activation = "relu") %>%
  layer_dense(units = 5, activation = "softmax")

freeze_weights(conv_base) # freeze weights to perform feature extraction

# Compile after freezing; the base's 14.7M parameters show up as
# non-trainable in the summary below, so only the new head will train.
# NOTE(review): `lr` is deprecated in newer keras in favor of
# `learning_rate` -- confirm against the installed keras version.
model %>% compile(
  loss = "categorical_crossentropy",
  optimizer = optimizer_adam(lr = 2e-5),
  metrics = c("accuracy")
)

model
## Model
## Model: "sequential_2"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## vgg16 (Model)                       (None, 7, 7, 512)               14714688    
## ________________________________________________________________________________
## flatten_2 (Flatten)                 (None, 25088)                   0           
## ________________________________________________________________________________
## dense_4 (Dense)                     (None, 256)                     6422784     
## ________________________________________________________________________________
## dense_5 (Dense)                     (None, 5)                       1285        
## ================================================================================
## Total params: 21,138,757
## Trainable params: 6,424,069
## Non-trainable params: 14,714,688
## ________________________________________________________________________________
# Performs feature extraction
# NOTE(review): steps_per_epoch = 1320 at batch_size 1 covers only 1320 of
# the 3120 training images per epoch -- confirm this subsampling is
# intentional. validation_steps = 600 matches the 600 validation images.
# NOTE(review): fit_generator() is deprecated in newer keras in favor of fit().
history <- model %>% fit_generator(
  train_generator,
  steps_per_epoch = 1320,
  epochs = 30,
  validation_data = validation_generator,
  validation_steps = 600,
  verbose = 0
)

plot(history) 
## `geom_smooth()` using formula 'y ~ x'

# fine tuning
unfreeze_weights(conv_base, from = "block5_conv1") # unfreeze lower portion of vgg16 for tuning

# Recompile with a smaller learning rate so fine-tuning makes only small
# updates to the newly unfrozen block5 convolution layers.
model %>% compile(
  loss = "categorical_crossentropy",
  optimizer = optimizer_adam(lr = 1e-5),
  metrics = c("accuracy")
)

history <- model %>% fit_generator(
  train_generator,
  steps_per_epoch = 1320,
  epochs = 30,
  validation_data = validation_generator,
  validation_steps = 600,
  verbose = 0
)

plot(history)
## `geom_smooth()` using formula 'y ~ x'

# Save the fully fine-tuned model for evaluation below.
model %>% save_model_hdf5('tuned_model.h5')

Evaluating The Final Model

The final test accuracy is almost 90%; this is a massive improvement over the previous models we tried.

# Evaluate the fine-tuned model on the held-out test set.
model <- load_model_hdf5('tuned_model.h5')

# Test pipeline: rescaling only; at batch_size 1, steps = 600 covers all
# 600 test images exactly once.
test_generator <- flow_images_from_directory(
  test_dir,
  test_datagen,
  target_size = c(224, 224),
  batch_size = 1,
  class_mode = "categorical"
)

# NOTE(review): evaluate_generator() is deprecated in newer keras in favor
# of evaluate() -- confirm installed version.
model %>% evaluate_generator(test_generator, steps = 600)
##      loss  accuracy 
## 0.4092226 0.8966666

Here we plot a few images and their corresponding prediction to verify the model is working.

  • 1 = daisy
  • 2 = dandelion
  • 3 = rose
  • 4 = sunflower
  • 5 = tulip

It appears the model is indeed working and giving pretty impressive results; even with a limited number of training images we are still able to achieve a high level of accuracy. Additional tuning may be able to increase performance even more, but these were the best results I was able to obtain after a few hours of tuning on my personal computer.

# Fresh generator yielding one batch of 6 test images for a visual spot
# check of the model's predictions.
test_gen <- flow_images_from_directory(
  test_dir,
  test_datagen,
  target_size = c(224, 224),
  batch_size = 6,
  class_mode = "categorical"
)

batch <- generator_next(test_gen)

test_imgs <- batch[[1]][1:6, , , ]
test_lab <- batch[[2]]

preds <- predict(model, test_imgs)

# Summary table: top predicted probability, predicted class index and true
# class index for each image. Built in one vectorized step instead of
# growing a data frame row by row with rbind() (an O(n^2) anti-pattern).
# Column names X1/X2/X3 match the original rbind-based construction.
pred_idx <- apply(preds, 1, which.max)
true_idx <- apply(test_lab, 1, which.max)
mat <- data.frame(
  X1 = preds[cbind(seq_len(nrow(preds)), pred_idx)],
  X2 = as.numeric(pred_idx),
  X3 = as.numeric(true_idx)
)

# Convert the numeric class indices in columns 2-3 of `mat` to species
# names with a lookup vector instead of a nested if/else chain over every
# cell. Index order matches the 1 = daisy ... 5 = tulip mapping listed
# above; which.max() always yields an index in 1:5, so no NA can result.
class_names <- c('daisy', 'dandelion', 'rose', 'sunflower', 'tulip')
for (j in 2:3) {
  mat[[j]] <- class_names[mat[[j]]]
}

Here we plot the predictions to verify they are indeed accurate.

# Label the summary table and print predictions beside ground truth.
# Fix: `<-` instead of `=` for top-level assignment.
names(mat) <- c('Pred Prob', 'Pred Val', 'True Val')
print(mat) # compare prediction with true value
##   Pred Prob  Pred Val  True Val
## 1 1.0000000 sunflower sunflower
## 2 0.9999982      rose      rose
## 3 0.9999924      rose      rose
## 4 1.0000000     tulip     tulip
## 5 0.9997993     tulip     tulip
## 6 0.5749264 sunflower     tulip
pred_class <- mat[, 2]
true_class <- mat[, 3]

# Plot each test image with its predicted (top) and true (bottom) class
# overlaid, restoring graphics settings afterwards.
{op <- par(mfrow = c(2, 3), pty = "s", mar = c(1, 0, 1, 0))
  for (i in 1:6) {
    plot(as.raster(test_imgs[i, , , ]))
    text(x = 112, y = 210, labels = paste("Prediction:", pred_class[i]),
         col = 'black', cex = 1.5, font = 2)
    text(x = 112, y = 14, labels = paste("True Class:", true_class[i]),
         col = 'black', cex = 1.5, font = 2)
  }
par(op)}

Visualizing Intermediate Activations

We will run a test image of a rose through the first few layers in our model to plot the outputs. This will allow us to see what specific flower attributes are being picked up by the network.

# Reload the baseline CNN and prepare one rose test image as a normalized
# (1, 224, 224, 3) tensor.
model <- load_model_hdf5('base_model.h5')

img_path <- "C:/Users/ian/Documents/Projects/flowers_final/test/rose/rose675.jpg"
img <- image_load(img_path, target_size = c(224, 224))
img_tensor <- image_to_array(img)
img_tensor <- array_reshape(img_tensor, c(1, 224, 224, 3))
# Rescale to [0, 1] to match the training-time generator preprocessing.
img_tensor <- img_tensor / 255
dim(img_tensor)
## [1]   1 224 224   3
plot(as.raster(img_tensor[1,,,]))

# Build a multi-output model exposing the outputs of the first 8 layers so
# we can inspect the intermediate activations for the test image.
layer_outputs <- lapply(model$layers[1:8], function(layer) layer$output)
activation_model <- keras_model(inputs = model$input, outputs = layer_outputs)

# One prediction returns a list of 8 activation tensors, one per layer.
activations <- activation_model %>% predict(img_tensor)

first_layer_activation <- activations[[1]]
dim(first_layer_activation)
## [1]   1 222 222  32
# Render a single activation channel as a terrain-colored heat map.
# The matrix is rotated 90 degrees (transpose of column-reversed input)
# so the channel displays upright, since image() draws matrices
# transposed relative to screen orientation.
plot_channel <- function(channel) {
  rotated <- t(apply(channel, 2, rev))
  image(rotated, axes = FALSE, asp = 1, col = terrain.colors(12))
}

By plotting the intermediate activations we can see how the network is able to recognize roses; the early layers extract more general visual features, while the deeper layers extract attributes that are more abstract and less visually interpretable.

# Each activation channel is rendered at 64x64 px, 16 channels per row.
image_size <- 64
images_per_row <- 16

# For each of the first 8 layers, write one PNG containing a grid of every
# channel's activation for the rose test image. Assumes each layer's
# channel count is a multiple of images_per_row (true here: the summary
# shows 32, 64 or 128 filters per layer).
for (i in 1:8) {
    layer_activation <- activations[[i]]
    layer_name <- model$layers[[i]]$name
    # Channel count is the 4th dimension of the (1, h, w, channels) tensor.
    n_features <- dim(layer_activation)[[4]]
    n_cols <- n_features %/% images_per_row
    png(paste0("flower_activations_", i, "_", layer_name, ".png"),
    width = image_size * images_per_row,
    height = image_size * n_cols)
    op <- par(mfrow = c(n_cols, images_per_row), mai = rep_len(0.02, 4))
      # Fill the grid row-major: channel index = col*16 + row + 1.
      for (col in 0:(n_cols-1)) {
        for (row in 0:(images_per_row-1)) {
          channel_image <- layer_activation[1,,,(col*images_per_row) + row + 1]
          plot_channel(channel_image)
        }
      }
  par(op)
  dev.off()
}
Activation Conv_2d_1

Activation Conv_2d_1

Activation Conv_2d_2

Activation Conv_2d_2

Activation Conv_2d_3

Activation Conv_2d_3

Activation Conv_2d_4

Activation Conv_2d_4